wine_data <-
read.csv(here::here("data", "input", "winemag-data-130k-v2.csv"))
colors <- read.csv(here::here("data", "input", "wine-colors.csv"))
twitter_stats <- read.csv(here::here("data", "input", "twitter-data.csv"))
# Add color
# wine_data_with_color <- left_join(wine_data, colors, by = "variety")
wine_data <- dplyr::left_join(wine_data, colors, by = "variety")
# Add in twitter stats
wine_data <- dplyr::left_join(wine_data, twitter_stats, by = "taster_twitter_handle")Apply the cleaning strategies from above
wine_data <-
wine_data %>% dplyr::mutate(
country = as.character(country),
variety = as.character(variety),
taster_name = as.character(taster_name),
title = as.character(title),
color_lump = fct_lump(color, n = 2),
province_lump = fct_lump(wine_data$province, n = 10),
country_lump = fct_lump(wine_data$country, n = 10)
) %>%
dplyr::filter (country != "" &
variety != "" & taster_name != "") %>% drop_na(price) Turn score into categories per https://www.winespectator.com/articles/scoring-scale
Turn score into categories per https://www.winespectator.com/articles/scoring-scale
Has accents presuming the american market might pay more … or less for wines that sound exotic.
wine_designations_word_cloud <-
wine_designations_word_cloud %>% mutate(designation = str_replace_all(designation, "([Rr].serv.)", "Reserve"))
## Calculate Corpus
wineDesignation.Corpus <-
Corpus(VectorSource(wine_designations_word_cloud$designation))
#Data Cleaning and Wrangling
wineDesignation.clean <- tm_map(wineDesignation.Corpus,
PlainTextDocument)
wineDesignation.clean <-
tm_map(wineDesignation.Corpus, tolower)
wineDesignation.clean <-
tm_map(wineDesignation.Corpus, removeNumbers)
wineDesignation.clean <-
tm_map(wineDesignation.Corpus, removeWords, stopwords("english"))
wineDesignation.clean <-
tm_map(wineDesignation.Corpus, removePunctuation)
wineDesignation.clean <-
tm_map(wineDesignation.Corpus, stripWhitespace)
wordcloud(words = wineDesignation.clean, min.freq = 2,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))# List of regular expressions to match
patterns <-
c(
".*([Rr].serv).*",
".*extra dry.*",
".*(dry|trocken).*",
".*brut.*",
".*(estate|grand|casa).*",
".*single.*",
".*(klassik|classic|tradition|vintage).*",
".*rose.*",
".*barrel s.*",
".*(old v|vieilles).*",
".*(vineyard|ranch|alpha|branco|broquel).*",
".*(barrel|crianza|cuve).*",
".*unoaked.*",
".*cuve prestige.*",
".*(blanc|white|bianco).*",
".*(red|tinto|bussia).*",
".*(nouveau|proprietary|signature|selec|premier).*",
".*lot.*",
".*late.*",
".*(oak|roble).*",
".*(organic|cannubi).*",
".*(port|colheita).*",
".*(collection|premium|prestige|limited).*",
".*clone.*",
".*(block|bin).*",
"^$"
)
# Values to replace above patterns with
replacements <-
c(
"Reserve",
"Extra Dry",
"Dry",
"Brut",
"Estate",
"Single Vineyard",
"Classic Vintage",
"Rose",
"Barrel Sample",
"Old Vine",
"Some Vineyard",
"Barrel",
"UnOaked",
"Finest Champagne",
"White",
"Red",
"Signature",
"Lot",
"Late Harvest",
"Oak",
"Organic",
"Port",
"Premium",
"Clone",
"Block",
"No Designation"
)
# This is so that we can use the replacements object in str_replace_all
# rather than a single pattern/replacement
names(replacements) <- patterns
wine_data <-
wine_data %>% dplyr::mutate(designation = str_replace_all(designation, replacements)) %>%
dplyr::mutate(designation = as.factor(designation)).append_color_to_factor <- function(variety, color) {
variety = as.character(variety)
variety = paste(variety, color)
return(variety)
}
red <-
wine_data %>% dplyr::filter(color_lump == "Red") %>%
dplyr::mutate(variety_lump = fct_lump(variety, n = 5)) %>%
dplyr::mutate(variety_lump = .append_color_to_factor(variety_lump, "(R)"))
white <-
wine_data %>% dplyr::filter(color_lump == "White") %>%
dplyr::mutate(variety_lump = fct_lump(variety, n = 5)) %>%
dplyr::mutate(variety_lump = .append_color_to_factor(variety_lump, "(W)"))
other <-
wine_data %>% dplyr::filter(color_lump == "Other") %>%
dplyr::mutate(variety_lump = "Other")
wine_data_bind <- do.call("rbind", list(red, white, other))
wine_data <-
wine_data_bind %>% dplyr::mutate(variety_lump = factor(variety_lump))wine_data <-
wine_data %>% dplyr::filter(as.character(taster_twitter_handle) != "") %>% dplyr::mutate(
taster_name_lump = fct_lump(taster_name, n = 5),
taster_twitter_lump = fct_lump(taster_twitter_handle, n = 5),
designation_lump = fct_lump(designation, n = 10),
country_lump = fct_lump(country, n = 10),
variety_lump = fct_lump(variety, n = 10)
)wine_data_clean <-
wine_data %>%
dplyr::select(
ID,
price,
country,
variety,
points,
point_cat,
title_length,
title_has_accents,
variety_lump,
designation_lump,
taster_name_lump,
taster_twitter_lump,
taster_gender,
taster_avg_points,
taster_review_count,
taster_n_tweets,
color_lump,
country_lump,
province_lump,
price_cat,
title_word_count,
title_sentement
) %>% droplevels()## [1] "X" "description"
## [3] "designation" "province"
## [5] "region_1" "region_2"
## [7] "taster_name" "taster_twitter_handle"
## [9] "title" "winery"
## [11] "color" "taster_n_followers"
## [13] "title_no_accents"
table_numeric <-
tableby(~ ., data = wine_data_clean %>%
select_if(is.numeric))
summary(table_numeric, title = "Wine Data by Price - Numeric Columns")| Overall (N=91500) | |
|---|---|
| ID | |
| Â Â Â Mean (SD) | 45750.500 (26413.919) |
| Â Â Â Range | 1.000 - 91500.000 |
| price | |
| Â Â Â Mean (SD) | 35.287 (43.291) |
| Â Â Â Range | 4.000 - 3300.000 |
| points | |
| Â Â Â Mean (SD) | 88.610 (2.967) |
| Â Â Â Range | 80.000 - 100.000 |
| title_length | |
| Â Â Â Mean (SD) | 52.749 (13.742) |
| Â Â Â Range | 12.000 - 136.000 |
| taster_avg_points | |
| Â Â Â Mean (SD) | 88.610 (0.950) |
| Â Â Â Range | 86.610 - 90.613 |
| taster_review_count | |
| Â Â Â Mean (SD) | 11256.584 (5946.771) |
| Â Â Â Range | 6.000 - 20172.000 |
| taster_n_tweets | |
| Â Â Â Mean (SD) | 2660.376 (4126.567) |
| Â Â Â Range | 257.000 - 19200.000 |
| title_word_count | |
| Â Â Â Mean (SD) | 6.631 (2.087) |
| Â Â Â Range | 2.000 - 23.000 |
| title_sentement | |
| Â Â Â Mean (SD) | 0.033 (0.113) |
| Â Â Â Range | -0.875 - 1.071 |
table_factor <-
tableby(~ ., data = wine_data_clean %>%
select_if(is.factor))
summary(table_factor, title = "Wine Data by Price - Factor Columns")| Overall (N=91500) | |
|---|---|
| point_cat | |
| Â Â Â Good | 7465 (8.2%) |
| Â Â Â Very good | 47803 (52.2%) |
| Â Â Â Outstanding | 34576 (37.8%) |
| Â Â Â Classic | 1656 (1.8%) |
| variety_lump | |
| Â Â Â Bordeaux-style Red Blend | 4700 (5.1%) |
| Â Â Â Cabernet Sauvignon | 6090 (6.7%) |
| Â Â Â Chardonnay | 7935 (8.7%) |
| Â Â Â Malbec | 2509 (2.7%) |
| Â Â Â Pinot Noir | 9622 (10.5%) |
| Â Â Â Red Blend | 6737 (7.4%) |
| Â Â Â Riesling | 2583 (2.8%) |
|    Rosé | 2868 (3.1%) |
| Â Â Â Sauvignon Blanc | 3755 (4.1%) |
| Â Â Â Syrah | 3075 (3.4%) |
| Â Â Â Other | 41626 (45.5%) |
| designation_lump | |
| Â Â Â Barrel | 1983 (2.2%) |
| Â Â Â Brut | 2390 (2.6%) |
| Â Â Â Classic Vintage | 932 (1.0%) |
| Â Â Â Estate | 4790 (5.2%) |
| Â Â Â No Designation | 25584 (28.0%) |
| Â Â Â Old Vine | 922 (1.0%) |
| Â Â Â Red | 1014 (1.1%) |
| Â Â Â Reserve | 8815 (9.6%) |
| Â Â Â Signature | 1953 (2.1%) |
| Â Â Â Some Vineyard | 8199 (9.0%) |
| Â Â Â Other | 34918 (38.2%) |
| taster_name_lump | |
|    Kerin O’Keefe | 9874 (10.8%) |
| Â Â Â Michael Schachner | 14944 (16.3%) |
| Â Â Â Paul Gregutt | 9497 (10.4%) |
| Â Â Â Roger Voss | 20172 (22.0%) |
| Â Â Â Virginie Boone | 9507 (10.4%) |
| Â Â Â Other | 27506 (30.1%) |
| taster_twitter_lump | |
| Â Â Â @kerinokeefe | 9874 (10.8%) |
| Â Â Â @paulgwine | 9497 (10.4%) |
| Â Â Â @vboone | 9507 (10.4%) |
| Â Â Â @vossroger | 20172 (22.0%) |
| Â Â Â @wineschach | 14944 (16.3%) |
| Â Â Â Other | 27506 (30.1%) |
| taster_gender | |
| Â Â Â F | 25583 (28.0%) |
| Â Â Â M | 65917 (72.0%) |
| color_lump | |
| Â Â Â Red | 57244 (62.6%) |
| Â Â Â White | 28103 (30.7%) |
| Â Â Â Other | 6153 (6.7%) |
| country_lump | |
| Â Â Â Argentina | 3753 (4.1%) |
| Â Â Â Australia | 2007 (2.2%) |
| Â Â Â Austria | 2791 (3.1%) |
| Â Â Â Chile | 4305 (4.7%) |
| Â Â Â France | 17525 (19.2%) |
| Â Â Â Italy | 10121 (11.1%) |
| Â Â Â New Zealand | 1270 (1.4%) |
| Â Â Â Portugal | 4870 (5.3%) |
| Â Â Â Spain | 6509 (7.1%) |
| Â Â Â US | 34535 (37.7%) |
| Â Â Â Other | 3814 (4.2%) |
| province_lump | |
| Â Â Â Bordeaux | 3990 (4.4%) |
| Â Â Â Burgundy | 3090 (3.4%) |
| Â Â Â California | 20042 (21.9%) |
| Â Â Â Mendoza Province | 3223 (3.5%) |
| Â Â Â Northern Spain | 3769 (4.1%) |
| Â Â Â Oregon | 5323 (5.8%) |
| Â Â Â Piedmont | 2111 (2.3%) |
| Â Â Â Tuscany | 3276 (3.6%) |
| Â Â Â Veneto | 1036 (1.1%) |
| Â Â Â Washington | 8575 (9.4%) |
| Â Â Â Other | 37065 (40.5%) |
| price_cat | |
| Â Â Â N-Miss | 1 |
| Â Â Â Budget ($0-4) | 7 (0.0%) |
| Â Â Â Every Day ($5-12) | 9887 (10.8%) |
| Â Â Â Premium ($13-50) | 66712 (72.9%) |
| Â Â Â Ultra Premium ($51-200) | 14337 (15.7%) |
| Â Â Â Luxury ($201-750) | 521 (0.6%) |
| Â Â Â What the fuck is wrong with you? ($751+) | 35 (0.0%) |
table_complete <-
tableby(price_cat ~ ., data = wine_data_clean %>%
select(
-c(ID, country, variety)
))
summary(table_complete, title = "Wine Data by Price Group")| Budget ($0-4) (N=7) | Every Day ($5-12) (N=9887) | Premium ($13-50) (N=66712) | Ultra Premium ($51-200) (N=14337) | Luxury ($201-750) (N=521) | What the fuck is wrong with you? ($751+) (N=35) | Total (N=91499) | p value | |
|---|---|---|---|---|---|---|---|---|
| price | < 0.001 | |||||||
| Â Â Â Mean (SD) | 4.000 (0.000) | 10.406 (1.489) | 26.752 (10.615) | 78.552 (28.051) | 328.814 (113.631) | 1153.171 (509.730) | 35.252 (41.924) | |
| Â Â Â Range | 4.000 - 4.000 | 5.000 - 12.000 | 13.000 - 50.000 | 51.000 - 200.000 | 202.000 - 750.000 | 757.000 - 2500.000 | 4.000 - 2500.000 | |
| points | < 0.001 | |||||||
| Â Â Â Mean (SD) | 83.857 (1.069) | 85.473 (1.989) | 88.447 (2.601) | 91.313 (2.501) | 94.180 (2.486) | 95.686 (3.104) | 88.610 (2.967) | |
| Â Â Â Range | 82.000 - 85.000 | 80.000 - 93.000 | 80.000 - 98.000 | 81.000 - 100.000 | 83.000 - 100.000 | 87.000 - 100.000 | 80.000 - 100.000 | |
| point_cat | < 0.001 | |||||||
| Â Â Â Good | 5 (71.4%) | 2895 (29.3%) | 4437 (6.7%) | 127 (0.9%) | 1 (0.2%) | 0 (0.0%) | 7465 (8.2%) | |
| Â Â Â Very good | 2 (28.6%) | 6771 (68.5%) | 38117 (57.1%) | 2880 (20.1%) | 30 (5.8%) | 2 (5.7%) | 47802 (52.2%) | |
| Â Â Â Outstanding | 0 (0.0%) | 221 (2.2%) | 23850 (35.8%) | 10261 (71.6%) | 236 (45.3%) | 8 (22.9%) | 34576 (37.8%) | |
| Â Â Â Classic | 0 (0.0%) | 0 (0.0%) | 308 (0.5%) | 1069 (7.5%) | 254 (48.8%) | 25 (71.4%) | 1656 (1.8%) | |
| title_length | < 0.001 | |||||||
| Â Â Â Mean (SD) | 49.429 (11.886) | 50.731 (13.552) | 52.440 (13.489) | 55.721 (14.521) | 49.631 (14.919) | 42.400 (9.781) | 52.749 (13.742) | |
| Â Â Â Range | 38.000 - 66.000 | 12.000 - 124.000 | 16.000 - 136.000 | 16.000 - 136.000 | 21.000 - 113.000 | 28.000 - 69.000 | 12.000 - 136.000 | |
| title_has_accents | < 0.001 | |||||||
| Â Â Â FALSE | 7 (100.0%) | 6505 (65.8%) | 47217 (70.8%) | 11265 (78.6%) | 314 (60.3%) | 14 (40.0%) | 65322 (71.4%) | |
| Â Â Â TRUE | 0 (0.0%) | 3382 (34.2%) | 19495 (29.2%) | 3072 (21.4%) | 207 (39.7%) | 21 (60.0%) | 26177 (28.6%) | |
| variety_lump | < 0.001 | |||||||
| Â Â Â Bordeaux-style Red Blend | 0 (0.0%) | 281 (2.8%) | 3454 (5.2%) | 898 (6.3%) | 57 (10.9%) | 9 (25.7%) | 4699 (5.1%) | |
| Â Â Â Cabernet Sauvignon | 1 (14.3%) | 722 (7.3%) | 3639 (5.5%) | 1684 (11.7%) | 44 (8.4%) | 0 (0.0%) | 6090 (6.7%) | |
| Â Â Â Chardonnay | 1 (14.3%) | 858 (8.7%) | 5784 (8.7%) | 1227 (8.6%) | 61 (11.7%) | 4 (11.4%) | 7935 (8.7%) | |
| Â Â Â Malbec | 0 (0.0%) | 399 (4.0%) | 1842 (2.8%) | 260 (1.8%) | 8 (1.5%) | 0 (0.0%) | 2509 (2.7%) | |
| Â Â Â Pinot Noir | 0 (0.0%) | 241 (2.4%) | 6141 (9.2%) | 3117 (21.7%) | 119 (22.8%) | 4 (11.4%) | 9622 (10.5%) | |
| Â Â Â Red Blend | 0 (0.0%) | 419 (4.2%) | 5215 (7.8%) | 1070 (7.5%) | 33 (6.3%) | 0 (0.0%) | 6737 (7.4%) | |
| Â Â Â Riesling | 0 (0.0%) | 235 (2.4%) | 2077 (3.1%) | 251 (1.8%) | 20 (3.8%) | 0 (0.0%) | 2583 (2.8%) | |
|    Rosé | 0 (0.0%) | 651 (6.6%) | 2183 (3.3%) | 33 (0.2%) | 0 (0.0%) | 1 (2.9%) | 2868 (3.1%) | |
| Â Â Â Sauvignon Blanc | 0 (0.0%) | 729 (7.4%) | 2952 (4.4%) | 74 (0.5%) | 0 (0.0%) | 0 (0.0%) | 3755 (4.1%) | |
| Â Â Â Syrah | 1 (14.3%) | 124 (1.3%) | 2362 (3.5%) | 577 (4.0%) | 11 (2.1%) | 0 (0.0%) | 3075 (3.4%) | |
| Â Â Â Other | 4 (57.1%) | 5228 (52.9%) | 31063 (46.6%) | 5146 (35.9%) | 168 (32.2%) | 17 (48.6%) | 41626 (45.5%) | |
| designation_lump | < 0.001 | |||||||
| Â Â Â Barrel | 0 (0.0%) | 128 (1.3%) | 1679 (2.5%) | 167 (1.2%) | 9 (1.7%) | 0 (0.0%) | 1983 (2.2%) | |
| Â Â Â Brut | 0 (0.0%) | 108 (1.1%) | 1441 (2.2%) | 795 (5.5%) | 45 (8.6%) | 1 (2.9%) | 2390 (2.6%) | |
| Â Â Â Classic Vintage | 0 (0.0%) | 89 (0.9%) | 691 (1.0%) | 148 (1.0%) | 4 (0.8%) | 0 (0.0%) | 932 (1.0%) | |
| Â Â Â Estate | 1 (14.3%) | 378 (3.8%) | 3454 (5.2%) | 945 (6.6%) | 12 (2.3%) | 0 (0.0%) | 4790 (5.2%) | |
| Â Â Â No Designation | 2 (28.6%) | 3498 (35.4%) | 19768 (29.6%) | 2112 (14.7%) | 182 (34.9%) | 21 (60.0%) | 25583 (28.0%) | |
| Â Â Â Old Vine | 0 (0.0%) | 78 (0.8%) | 737 (1.1%) | 106 (0.7%) | 1 (0.2%) | 0 (0.0%) | 922 (1.0%) | |
| Â Â Â Red | 1 (14.3%) | 191 (1.9%) | 684 (1.0%) | 133 (0.9%) | 5 (1.0%) | 0 (0.0%) | 1014 (1.1%) | |
| Â Â Â Reserve | 0 (0.0%) | 1078 (10.9%) | 6321 (9.5%) | 1390 (9.7%) | 25 (4.8%) | 1 (2.9%) | 8815 (9.6%) | |
| Â Â Â Signature | 0 (0.0%) | 216 (2.2%) | 1058 (1.6%) | 661 (4.6%) | 18 (3.5%) | 0 (0.0%) | 1953 (2.1%) | |
| Â Â Â Some Vineyard | 0 (0.0%) | 247 (2.5%) | 5760 (8.6%) | 2179 (15.2%) | 13 (2.5%) | 0 (0.0%) | 8199 (9.0%) | |
| Â Â Â Other | 3 (42.9%) | 3876 (39.2%) | 25119 (37.7%) | 5701 (39.8%) | 207 (39.7%) | 12 (34.3%) | 34918 (38.2%) | |
| taster_name_lump | < 0.001 | |||||||
|    Kerin O’Keefe | 0 (0.0%) | 311 (3.1%) | 7130 (10.7%) | 2355 (16.4%) | 77 (14.8%) | 1 (2.9%) | 9874 (10.8%) | |
| Â Â Â Michael Schachner | 5 (71.4%) | 3922 (39.7%) | 9829 (14.7%) | 1134 (7.9%) | 53 (10.2%) | 1 (2.9%) | 14944 (16.3%) | |
| Â Â Â Paul Gregutt | 0 (0.0%) | 453 (4.6%) | 7726 (11.6%) | 1316 (9.2%) | 2 (0.4%) | 0 (0.0%) | 9497 (10.4%) | |
| Â Â Â Roger Voss | 0 (0.0%) | 2671 (27.0%) | 13977 (21.0%) | 3201 (22.3%) | 295 (56.6%) | 27 (77.1%) | 20171 (22.0%) | |
| Â Â Â Virginie Boone | 0 (0.0%) | 174 (1.8%) | 6469 (9.7%) | 2819 (19.7%) | 45 (8.6%) | 0 (0.0%) | 9507 (10.4%) | |
| Â Â Â Other | 2 (28.6%) | 2356 (23.8%) | 21581 (32.3%) | 3512 (24.5%) | 49 (9.4%) | 6 (17.1%) | 27506 (30.1%) | |
| taster_twitter_lump | < 0.001 | |||||||
| Â Â Â @kerinokeefe | 0 (0.0%) | 311 (3.1%) | 7130 (10.7%) | 2355 (16.4%) | 77 (14.8%) | 1 (2.9%) | 9874 (10.8%) | |
| Â Â Â @paulgwine | 0 (0.0%) | 453 (4.6%) | 7726 (11.6%) | 1316 (9.2%) | 2 (0.4%) | 0 (0.0%) | 9497 (10.4%) | |
| Â Â Â @vboone | 0 (0.0%) | 174 (1.8%) | 6469 (9.7%) | 2819 (19.7%) | 45 (8.6%) | 0 (0.0%) | 9507 (10.4%) | |
| Â Â Â @vossroger | 0 (0.0%) | 2671 (27.0%) | 13977 (21.0%) | 3201 (22.3%) | 295 (56.6%) | 27 (77.1%) | 20171 (22.0%) | |
| Â Â Â @wineschach | 5 (71.4%) | 3922 (39.7%) | 9829 (14.7%) | 1134 (7.9%) | 53 (10.2%) | 1 (2.9%) | 14944 (16.3%) | |
| Â Â Â Other | 2 (28.6%) | 2356 (23.8%) | 21581 (32.3%) | 3512 (24.5%) | 49 (9.4%) | 6 (17.1%) | 27506 (30.1%) | |
| taster_gender | < 0.001 | |||||||
| Â Â Â F | 0 (0.0%) | 1146 (11.6%) | 18582 (27.9%) | 5726 (39.9%) | 127 (24.4%) | 2 (5.7%) | 25583 (28.0%) | |
| Â Â Â M | 7 (100.0%) | 8741 (88.4%) | 48130 (72.1%) | 8611 (60.1%) | 394 (75.6%) | 33 (94.3%) | 65916 (72.0%) | |
| taster_avg_points | < 0.001 | |||||||
| Â Â Â Mean (SD) | 87.399 (0.838) | 87.922 (0.930) | 88.661 (0.947) | 88.848 (0.760) | 88.521 (0.598) | 88.657 (0.513) | 88.610 (0.950) | |
| Â Â Â Range | 86.909 - 88.626 | 86.610 - 90.613 | 86.610 - 90.613 | 86.610 - 90.613 | 86.610 - 90.034 | 86.909 - 90.613 | 86.610 - 90.613 | |
| taster_review_count | < 0.001 | |||||||
| Â Â Â Mean (SD) | 11866.000 (5256.686) | 13140.757 (6192.002) | 10940.497 (5936.448) | 11251.639 (5506.096) | 15691.409 (5626.161) | 17117.943 (5946.637) | 11256.487 (5946.730) | |
| Â Â Â Range | 4171.000 - 14944.000 | 469.000 - 20172.000 | 6.000 - 20172.000 | 27.000 - 20172.000 | 469.000 - 20172.000 | 3389.000 - 20172.000 | 6.000 - 20172.000 | |
| taster_n_tweets | < 0.001 | |||||||
| Â Â Â Mean (SD) | 1152.000 (102.470) | 1932.965 (3350.320) | 2799.164 (4295.539) | 2565.903 (3808.197) | 1408.345 (1160.877) | 1296.400 (1191.907) | 2660.397 (4126.585) | |
| Â Â Â Range | 1002.000 - 1212.000 | 803.000 - 19200.000 | 257.000 - 19200.000 | 257.000 - 19200.000 | 803.000 - 19200.000 | 803.000 - 6748.000 | 257.000 - 19200.000 | |
| color_lump | < 0.001 | |||||||
| Â Â Â Red | 4 (57.1%) | 4902 (49.6%) | 40544 (60.8%) | 11386 (79.4%) | 383 (73.5%) | 24 (68.6%) | 57243 (62.6%) | |
| Â Â Â White | 3 (42.9%) | 4019 (40.6%) | 21846 (32.7%) | 2130 (14.9%) | 95 (18.2%) | 10 (28.6%) | 28103 (30.7%) | |
| Â Â Â Other | 0 (0.0%) | 966 (9.8%) | 4322 (6.5%) | 821 (5.7%) | 43 (8.3%) | 1 (2.9%) | 6153 (6.7%) | |
| country_lump | < 0.001 | |||||||
| Â Â Â Argentina | 1 (14.3%) | 955 (9.7%) | 2504 (3.8%) | 287 (2.0%) | 6 (1.2%) | 0 (0.0%) | 3753 (4.1%) | |
| Â Â Â Australia | 0 (0.0%) | 225 (2.3%) | 1400 (2.1%) | 362 (2.5%) | 16 (3.1%) | 4 (11.4%) | 2007 (2.2%) | |
| Â Â Â Austria | 0 (0.0%) | 100 (1.0%) | 2360 (3.5%) | 330 (2.3%) | 0 (0.0%) | 1 (2.9%) | 2791 (3.1%) | |
| Â Â Â Chile | 0 (0.0%) | 1518 (15.4%) | 2576 (3.9%) | 201 (1.4%) | 10 (1.9%) | 0 (0.0%) | 4305 (4.7%) | |
| Â Â Â France | 0 (0.0%) | 1519 (15.4%) | 12637 (18.9%) | 3068 (21.4%) | 277 (53.2%) | 23 (65.7%) | 17524 (19.2%) | |
| Â Â Â Italy | 0 (0.0%) | 367 (3.7%) | 7286 (10.9%) | 2390 (16.7%) | 77 (14.8%) | 1 (2.9%) | 10121 (11.1%) | |
| Â Â Â New Zealand | 0 (0.0%) | 50 (0.5%) | 1111 (1.7%) | 109 (0.8%) | 0 (0.0%) | 0 (0.0%) | 1270 (1.4%) | |
| Â Â Â Portugal | 0 (0.0%) | 1466 (14.8%) | 2940 (4.4%) | 437 (3.0%) | 23 (4.4%) | 4 (11.4%) | 4870 (5.3%) | |
| Â Â Â Spain | 4 (57.1%) | 1418 (14.3%) | 4441 (6.7%) | 609 (4.2%) | 36 (6.9%) | 1 (2.9%) | 6509 (7.1%) | |
| Â Â Â US | 2 (28.6%) | 1683 (17.0%) | 26588 (39.9%) | 6210 (43.3%) | 51 (9.8%) | 1 (2.9%) | 34535 (37.7%) | |
| Â Â Â Other | 0 (0.0%) | 586 (5.9%) | 2869 (4.3%) | 334 (2.3%) | 25 (4.8%) | 0 (0.0%) | 3814 (4.2%) | |
| province_lump | < 0.001 | |||||||
| Â Â Â Bordeaux | 0 (0.0%) | 462 (4.7%) | 3047 (4.6%) | 414 (2.9%) | 51 (9.8%) | 15 (42.9%) | 3989 (4.4%) | |
| Â Â Â Burgundy | 0 (0.0%) | 18 (0.2%) | 1782 (2.7%) | 1115 (7.8%) | 169 (32.4%) | 6 (17.1%) | 3090 (3.4%) | |
| Â Â Â California | 2 (28.6%) | 832 (8.4%) | 14884 (22.3%) | 4275 (29.8%) | 48 (9.2%) | 1 (2.9%) | 20042 (21.9%) | |
| Â Â Â Mendoza Province | 1 (14.3%) | 812 (8.2%) | 2148 (3.2%) | 256 (1.8%) | 6 (1.2%) | 0 (0.0%) | 3223 (3.5%) | |
| Â Â Â Northern Spain | 0 (0.0%) | 767 (7.8%) | 2541 (3.8%) | 432 (3.0%) | 29 (5.6%) | 0 (0.0%) | 3769 (4.1%) | |
| Â Â Â Oregon | 0 (0.0%) | 87 (0.9%) | 4275 (6.4%) | 959 (6.7%) | 2 (0.4%) | 0 (0.0%) | 5323 (5.8%) | |
| Â Â Â Piedmont | 0 (0.0%) | 10 (0.1%) | 1112 (1.7%) | 967 (6.7%) | 22 (4.2%) | 0 (0.0%) | 2111 (2.3%) | |
| Â Â Â Tuscany | 0 (0.0%) | 80 (0.8%) | 2270 (3.4%) | 877 (6.1%) | 48 (9.2%) | 1 (2.9%) | 3276 (3.6%) | |
| Â Â Â Veneto | 0 (0.0%) | 56 (0.6%) | 747 (1.1%) | 228 (1.6%) | 5 (1.0%) | 0 (0.0%) | 1036 (1.1%) | |
| Â Â Â Washington | 0 (0.0%) | 647 (6.5%) | 6961 (10.4%) | 966 (6.7%) | 1 (0.2%) | 0 (0.0%) | 8575 (9.4%) | |
| Â Â Â Other | 4 (57.1%) | 6116 (61.9%) | 26945 (40.4%) | 3848 (26.8%) | 140 (26.9%) | 12 (34.3%) | 37065 (40.5%) | |
| title_word_count | < 0.001 | |||||||
| Â Â Â Mean (SD) | 6.714 (3.039) | 6.420 (2.142) | 6.569 (2.039) | 7.091 (2.202) | 6.046 (1.981) | 5.371 (1.784) | 6.631 (2.087) | |
| Â Â Â Range | 4.000 - 11.000 | 2.000 - 22.000 | 2.000 - 23.000 | 2.000 - 18.000 | 2.000 - 13.000 | 3.000 - 10.000 | 2.000 - 23.000 | |
| title_sentement | < 0.001 | |||||||
| Â Â Â Mean (SD) | -0.049 (0.220) | 0.031 (0.102) | 0.032 (0.114) | 0.041 (0.113) | 0.027 (0.085) | 0.046 (0.107) | 0.033 (0.113) | |
| Â Â Â Range | -0.510 - 0.200 | -0.875 - 0.937 | -0.850 - 1.071 | -0.530 - 0.795 | -0.408 - 0.583 | 0.000 - 0.378 | -0.875 - 1.071 |
world_map <- map_data("world")
world_map <-
world_map %>% dplyr::mutate(region = ifelse(region == "USA", "US", region))
world_map <- world_map %>% dplyr::mutate(country = region)
wine_map_DF <-
wine_data_clean %>% dplyr::mutate(country = as.character(country)) %>%
dplyr::filter (country != "England" &
country != "US-France" &
country != "")
# Generate Summary
wmap <-
wine_data_clean %>%
dplyr::group_by(country) %>%
dplyr::summarize(
point_min = min(points, na.rm = TRUE),
point_avg = mean(points, na.rm = TRUE),
point_max = max(points, na.rm = TRUE),
price_min = min(price, na.rm = TRUE),
price_avg = mean(price, na.rm = TRUE),
price_max = max(price, na.rm = TRUE)
) %>%
dplyr::select(
c(
"country",
"point_min",
"point_avg",
"point_max",
"price_min",
"price_avg",
"price_max"
)
)
# Create data frame with bounderies and values
wine_country_map <- right_join(wmap, world_map, by = "country")
#scale_fill_viridis_c(option = "C", limits = c(80, 100)) + theme_map() + ggtitle(title)
.map_from_attribute <- function(att, title) {
return(
ggplot(wine_country_map, aes_string(map_id = "country", fill = att)) +
geom_map(map = wine_country_map, color = "white") +
expand_limits(x = wine_country_map$long, y = wine_country_map$lat) +
scale_fill_viridis_c(option = "C") + theme_map() + ggtitle(title) +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.title = element_text(size=20)) +
theme(legend.title = element_blank())
)
}p <- ggplot(wine_data_clean, aes(points, price, color = point_cat)) +
geom_point() +
theme_clean() +
labs(title = "Price vs Points")
p1 <- ggMarginal(p, type="histogram", fill="slateblue")
p1 wine_data_clean %>% ggplot(aes(x = reorder(color_lump, points), y = points, fill = color_lump)) +
geom_boxplot() +
xlab("Color") +
theme_clean() +
theme(legend.position = "none") +
ggtitle("Wine Score vs color. Faceted by Gender and presence of accents in title") + facet_wrap(~ country_lump)ggplot(wine_data_clean, aes(log(price), points, color = point_cat)) +
geom_point() +
theme_fivethirtyeight() +
labs(title = "Score vs log of Price") +
facet_wrap(~ taster_gender)ggplot(wine_data_clean, aes(log(price), log(points), color = point_cat)) +
geom_point() +
theme_fivethirtyeight() +
labs(title = "Score vs log of Price") +
facet_wrap(~ country_lump)ggplot(wine_data_clean, aes(x = price, y = country_lump, color = point_cat)) +
geom_jitter(alpha=1/2) + ggtitle("Price and Country Colored by Point_Cat")ggplot(wine_data_clean, aes(x = log(price), y = country_lump, color = point_cat)) +
geom_jitter(alpha=1/2) + ggtitle("log(Price) and Country Colored by Point_Cat")## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] wordcloud_2.6 RColorBrewer_1.1-2
## [3] SnowballC_0.6.0 tm_0.7-6
## [5] NLP_0.2-0 sentimentr_2.8.0
## [7] PerformanceAnalytics_1.5.3 xts_0.11-2
## [9] zoo_1.8-6 stringi_1.4.3
## [11] plyr_1.8.4 ggmap_3.0.0
## [13] GGally_1.4.0 ggthemes_4.2.0
## [15] ggExtra_0.9 kableExtra_1.1.0
## [17] knitr_1.25 forcats_0.4.0
## [19] stringr_1.4.0 dplyr_0.8.3
## [21] purrr_0.3.3 readr_1.3.1
## [23] tidyr_1.0.0 tibble_2.1.3
## [25] ggplot2_3.2.1 tidyverse_1.2.1
## [27] arsenal_3.3.0 here_0.1
## [29] conflicted_1.0.4
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-141 bitops_1.0-6 lubridate_1.7.4
## [4] webshot_0.5.1 httr_1.4.1 rprojroot_1.3-2
## [7] tools_3.6.1 backports_1.1.5 R6_2.4.1
## [10] lazyeval_0.2.2 colorspace_1.4-1 withr_2.1.2
## [13] tidyselect_0.2.5 compiler_3.6.1 cli_1.1.0
## [16] rvest_0.3.5 textshape_1.6.0 xml2_1.2.2
## [19] labeling_0.3 slam_0.1-46 scales_1.0.0
## [22] quadprog_1.5-7 digest_0.6.22 rmarkdown_1.16
## [25] jpeg_0.1-8.1 pkgconfig_2.0.3 htmltools_0.4.0
## [28] maps_3.3.0 highr_0.8 fastmap_1.0.1
## [31] rlang_0.4.1 readxl_1.3.1 rstudioapi_0.10
## [34] shiny_1.4.0 generics_0.0.2 jsonlite_1.6
## [37] qdapRegex_0.7.2 magrittr_1.5 textclean_0.9.3
## [40] Matrix_1.2-17 Rcpp_1.0.3 munsell_0.5.0
## [43] lifecycle_0.1.0 yaml_2.2.0 grid_3.6.1
## [46] parallel_3.6.1 promises_1.1.0 crayon_1.3.4
## [49] miniUI_0.1.1.1 lattice_0.20-38 splines_3.6.1
## [52] haven_2.2.0 hms_0.5.2 zeallot_0.1.0
## [55] pillar_1.4.2 rjson_0.2.20 reshape2_1.4.3
## [58] glue_1.3.1 evaluate_0.14 data.table_1.12.6
## [61] modelr_0.1.5 png_0.1-7 vctrs_0.2.0
## [64] lexicon_1.2.1 httpuv_1.5.2 RgoogleMaps_1.4.4
## [67] cellranger_1.1.0 gtable_0.3.0 reshape_0.8.8
## [70] assertthat_0.2.1 xfun_0.10 mime_0.7
## [73] syuzhet_1.0.4 xtable_1.8-4 broom_0.5.2
## [76] later_1.0.0 survival_2.44-1.1 viridisLite_0.3.0
## [79] memoise_1.1.0 ellipsis_0.3.0